{
 "cells": [
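  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# EKS CSI EBS Setup\n",
    "\n",
    "This notebook installs the Amazon EBS CSI driver on an EKS cluster, grants the node instance role the IAM permissions the driver needs, and then creates a StorageClass, a PersistentVolumeClaim, and a pod that mounts the resulting EBS volume.\n",
    "\n",
    "Reference: [Amazon EKS User Guide (PDF)](https://docs.aws.amazon.com/eks/latest/userguide/eks-ug.pdf)"
   ]
  },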
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!pygmentize eks-ebs-storage-class.yaml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!kubectl delete -f eks-ebs-storage-class.yaml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!kubectl create -f eks-ebs-storage-class.yaml"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Set Up the Amazon EBS CSI Driver"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NAME                        PROVISIONER                                                RECLAIMPOLICY   VOLUMEBINDINGMODE      ALLOWVOLUMEEXPANSION   AGE\n",
      "gp2 (default)               kubernetes.io/aws-ebs                                      Delete          Immediate              false                  20s\n",
      "local-hostpath              openebs.io/local                                           Delete          WaitForFirstConsumer   false                  70m\n",
      "openebs-device              openebs.io/local                                           Delete          WaitForFirstConsumer   false                  100m\n",
      "openebs-hostpath            openebs.io/local                                           Delete          WaitForFirstConsumer   false                  88m\n",
      "openebs-jiva-default        openebs.io/provisioner-iscsi                               Delete          Immediate              false                  100m\n",
      "openebs-snapshot-promoter   volumesnapshot.external-storage.k8s.io/snapshot-promoter   Delete          Immediate              false                  100m\n"
     ]
    }
   ],
   "source": [
    "!kubectl get storageclass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n",
      "                                 Dload  Upload   Total   Spent    Left  Speed\n",
      "100   599  100   599    0     0   2980      0 --:--:-- --:--:-- --:--:--  2980\n"
     ]
    }
   ],
   "source": [
    "!curl -O https://raw.githubusercontent.com/kubernetes-sigs/aws-ebs-csi-driver/v0.6.0/docs/example-iam-policy.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "  \"Version\": \"2012-10-17\",\n",
      "  \"Statement\": [\n",
      "    {\n",
      "      \"Effect\": \"Allow\",\n",
      "      \"Action\": [\n",
      "        \"ec2:AttachVolume\",\n",
      "        \"ec2:CreateSnapshot\",\n",
      "        \"ec2:CreateTags\",\n",
      "        \"ec2:CreateVolume\",\n",
      "        \"ec2:DeleteSnapshot\",\n",
      "        \"ec2:DeleteTags\",\n",
      "        \"ec2:DeleteVolume\",\n",
      "        \"ec2:DescribeAvailabilityZones\",\n",
      "        \"ec2:DescribeInstances\",\n",
      "        \"ec2:DescribeSnapshots\",\n",
      "        \"ec2:DescribeTags\",\n",
      "        \"ec2:DescribeVolumes\",\n",
      "        \"ec2:DescribeVolumesModifications\",\n",
      "        \"ec2:DetachVolume\",\n",
      "        \"ec2:ModifyVolume\"\n",
      "      ],\n",
      "      \"Resource\": \"*\"\n",
      "    }\n",
      "  ]\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "!cat example-iam-policy.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "    \"Policy\": {\n",
      "        \"PolicyName\": \"Amazon_EBS_CSI_Driver\",\n",
      "        \"PolicyId\": \"ANPATLVNRE7W2RTE2BRUD\",\n",
      "        \"Arn\": \"arn:aws:iam::231218423789:policy/Amazon_EBS_CSI_Driver\",\n",
      "        \"Path\": \"/\",\n",
      "        \"DefaultVersionId\": \"v1\",\n",
      "        \"AttachmentCount\": 0,\n",
      "        \"PermissionsBoundaryUsageCount\": 0,\n",
      "        \"IsAttachable\": true,\n",
      "        \"CreateDate\": \"2020-10-29T11:41:21Z\",\n",
      "        \"UpdateDate\": \"2020-10-29T11:41:21Z\"\n",
      "    }\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "!aws iam create-policy --policy-name Amazon_EBS_CSI_Driver \\\n",
    " --policy-document file://example-iam-policy.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "policy_arn='arn:aws:iam::231218423789:policy/Amazon_EBS_CSI_Driver'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Name:         aws-auth\n",
      "Namespace:    kube-system\n",
      "Labels:       <none>\n",
      "Annotations:  <none>\n",
      "\n",
      "Data\n",
      "====\n",
      "mapRoles:\n",
      "----\n",
      "- groups:\n",
      "  - system:bootstrappers\n",
      "  - system:nodes\n",
      "  rolearn: arn:aws:iam::231218423789:role/eksctl-cluster-nodegroup-cpu-node-NodeInstanceRole-1QB4MRINE2FO9\n",
      "  username: system:node:{{EC2PrivateDNSName}}\n",
      "\n",
      "Events:  <none>\n"
     ]
    }
   ],
   "source": [
    "!kubectl -n kube-system describe configmap aws-auth"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "role_name = 'eksctl-cluster-nodegroup-cpu-node-NodeInstanceRole-1QB4MRINE2FO9'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "!aws iam attach-role-policy \\\n",
    "    --policy-arn $policy_arn \\\n",
    "    --role-name $role_name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Client Version: v1.15.10-eks-bac369\n"
     ]
    }
   ],
   "source": [
    "!kubectl version --client --short"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "serviceaccount/ebs-csi-controller-sa created\n",
      "clusterrole.rbac.authorization.k8s.io/ebs-external-attacher-role created\n",
      "clusterrole.rbac.authorization.k8s.io/ebs-external-provisioner-role created\n",
      "clusterrolebinding.rbac.authorization.k8s.io/ebs-csi-attacher-binding created\n",
      "clusterrolebinding.rbac.authorization.k8s.io/ebs-csi-provisioner-binding created\n",
      "deployment.apps/ebs-csi-controller created\n",
      "daemonset.apps/ebs-csi-node created\n",
      "csidriver.storage.k8s.io/ebs.csi.aws.com created\n"
     ]
    }
   ],
   "source": [
    "!kubectl apply -k \"github.com/kubernetes-sigs/aws-ebs-csi-driver/deploy/kubernetes/overlays/stable/?ref=master\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# EXAMPLE CODE\n",
    "#!git clone https://github.com/kubernetes-sigs/aws-ebs-csi-driver.git"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create Storage Class "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[94mkind\u001b[39;49;00m: StorageClass\n",
      "\u001b[94mapiVersion\u001b[39;49;00m: storage.k8s.io/v1\n",
      "\u001b[94mmetadata\u001b[39;49;00m:\n",
      "  \u001b[94mannotations\u001b[39;49;00m:\n",
      "    \u001b[94mstorageclass.kubernetes.io/is-default-class\u001b[39;49;00m: \u001b[33m\"\u001b[39;49;00m\u001b[33mtrue\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m\n",
      "  \u001b[94mname\u001b[39;49;00m: ebs-sc\n",
      "\u001b[94mprovisioner\u001b[39;49;00m: ebs.csi.aws.com\n",
      "\u001b[94mvolumeBindingMode\u001b[39;49;00m: WaitForFirstConsumer\n"
     ]
    }
   ],
   "source": [
    "!pygmentize eks-csi-ebs/storageclass.yaml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "storageclass.storage.k8s.io/ebs-sc created\n"
     ]
    }
   ],
   "source": [
    "!kubectl apply -f eks-csi-ebs/storageclass.yaml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cleanup only: left disabled so a top-to-bottom run keeps the ebs-sc storage class\n",
    "# that the claim created in the next section refers to. Uncomment to tear it down.\n",
    "#!kubectl delete -f eks-csi-ebs/storageclass.yaml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NAME                        PROVISIONER                                                RECLAIMPOLICY   VOLUMEBINDINGMODE      ALLOWVOLUMEEXPANSION   AGE\n",
      "ebs-sc (default)            ebs.csi.aws.com                                            Delete          WaitForFirstConsumer   false                  57s\n",
      "local-hostpath              openebs.io/local                                           Delete          WaitForFirstConsumer   false                  83m\n",
      "openebs-device              openebs.io/local                                           Delete          WaitForFirstConsumer   false                  113m\n",
      "openebs-hostpath            openebs.io/local                                           Delete          WaitForFirstConsumer   false                  101m\n",
      "openebs-jiva-default        openebs.io/provisioner-iscsi                               Delete          Immediate              false                  113m\n",
      "openebs-snapshot-promoter   volumesnapshot.external-storage.k8s.io/snapshot-promoter   Delete          Immediate              false                  113m\n"
     ]
    }
   ],
   "source": [
    "!kubectl get storageclass"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create Claim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[94mapiVersion\u001b[39;49;00m: v1\n",
      "\u001b[94mkind\u001b[39;49;00m: PersistentVolumeClaim\n",
      "\u001b[94mmetadata\u001b[39;49;00m:\n",
      "  \u001b[94mname\u001b[39;49;00m: ebs-claim\n",
      "\u001b[94mspec\u001b[39;49;00m:\n",
      "  \u001b[94maccessModes\u001b[39;49;00m:\n",
      "    - ReadWriteOnce\n",
      "  \u001b[94mstorageClassName\u001b[39;49;00m: ebs-sc\n",
      "  \u001b[94mresources\u001b[39;49;00m:\n",
      "    \u001b[94mrequests\u001b[39;49;00m:\n",
      "      \u001b[94mstorage\u001b[39;49;00m: 5Gi\n"
     ]
    }
   ],
   "source": [
    "!pygmentize eks-csi-ebs/claim.yaml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "persistentvolumeclaim/ebs-claim created\n"
     ]
    }
   ],
   "source": [
    "!kubectl apply -f eks-csi-ebs/claim.yaml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cleanup only: left disabled so a top-to-bottom run keeps ebs-claim, which the pod\n",
    "# created in the next section mounts. Uncomment to tear it down.\n",
    "#!kubectl delete -f eks-csi-ebs/claim.yaml"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create Pod"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[04m\u001b[36m---\u001b[39;49;00m \n",
      "\u001b[94mapiVersion\u001b[39;49;00m: v1\n",
      "\u001b[94mkind\u001b[39;49;00m: Pod\n",
      "\u001b[94mmetadata\u001b[39;49;00m:\n",
      "  \u001b[94mname\u001b[39;49;00m: bert-ml-pod-eks\n",
      "\u001b[94mspec\u001b[39;49;00m:\n",
      "  \u001b[94mvolumes\u001b[39;49;00m:\n",
      "  - \u001b[94mname\u001b[39;49;00m: persistent-storage\n",
      "    \u001b[94mpersistentVolumeClaim\u001b[39;49;00m:\n",
      "      \u001b[94mclaimName\u001b[39;49;00m: ebs-claim\n",
      "  \u001b[94mcontainers\u001b[39;49;00m: \n",
      "    - \u001b[94mname\u001b[39;49;00m: bert\n",
      "      \u001b[94mcommand\u001b[39;49;00m: \n",
      "        - python\n",
      "        - /opt/ml/code/train.py\n",
      "        - --train_steps_per_epoch=1\n",
      "        - --epochs=1\n",
      "        - --learning_rate=0.00001\n",
      "        - --epsilon=0.00000001\n",
      "        - --train_batch_size=36\n",
      "        - --validation_batch_size=18\n",
      "        - --test_batch_size=18\n",
      "        - --train_steps_per_epoch=1\n",
      "        - --validation_steps=1\n",
      "        - --test_steps=1\n",
      "        - --use_xla=True\n",
      "        - --use_amp=False\n",
      "        - --max_seq_length=64\n",
      "        - --freeze_bert_layer=True\n",
      "        - --enable_sagemaker_debugger=False\n",
      "        - --enable_checkpointing=False\n",
      "        - --enable_tensorboard=False\n",
      "        - --run_validation=True\n",
      "        - --run_test=False\n",
      "        - --run_sample_predictions=True\n",
      "      \u001b[94mimage\u001b[39;49;00m: \u001b[33m\"\u001b[39;49;00m\u001b[33m231218423789.dkr.ecr.us-west-2.amazonaws.com/dlc-demo:bert\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m\n",
      "      \u001b[94mimagePullPolicy\u001b[39;49;00m: Always\n",
      "      \u001b[94menv\u001b[39;49;00m: \n",
      "        - \u001b[94mname\u001b[39;49;00m: SM_TRAINING_ENV\n",
      "          \u001b[94mvalue\u001b[39;49;00m: \u001b[33m\"\u001b[39;49;00m\u001b[33m{\u001b[39;49;00m\u001b[33m\\\"\u001b[39;49;00m\u001b[33mis_master\u001b[39;49;00m\u001b[33m\\\"\u001b[39;49;00m\u001b[33m:true}\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m\n",
      "        - \u001b[94mname\u001b[39;49;00m: SAGEMAKER_JOB_NAME\n",
      "          \u001b[94mvalue\u001b[39;49;00m: \u001b[33m\"\u001b[39;49;00m\u001b[33mtf-bert-training-eks\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m\n",
      "        - \u001b[94mname\u001b[39;49;00m: SM_CURRENT_HOST\n",
      "          \u001b[94mvalue\u001b[39;49;00m: \u001b[33m\"\u001b[39;49;00m\u001b[33mlocalhost\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m\n",
      "        - \u001b[94mname\u001b[39;49;00m: SM_NUM_GPUS\n",
      "          \u001b[94mvalue\u001b[39;49;00m: \u001b[33m\"\u001b[39;49;00m\u001b[33m0\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m\n",
      "        - \u001b[94mname\u001b[39;49;00m: SM_HOSTS\n",
      "          \u001b[94mvalue\u001b[39;49;00m: \u001b[33m\"\u001b[39;49;00m\u001b[33m{\u001b[39;49;00m\u001b[33m\\\"\u001b[39;49;00m\u001b[33mhosts\u001b[39;49;00m\u001b[33m\\\"\u001b[39;49;00m\u001b[33m:\u001b[39;49;00m\u001b[33m\\\"\u001b[39;49;00m\u001b[33mlocalhost\u001b[39;49;00m\u001b[33m\\\"\u001b[39;49;00m\u001b[33m}\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m\n",
      "        - \u001b[94mname\u001b[39;49;00m: SM_MODEL_DIR\n",
      "          \u001b[94mvalue\u001b[39;49;00m: \u001b[33m\"\u001b[39;49;00m\u001b[33mmnt/opt/ml/model/\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m     \n",
      "        - \u001b[94mname\u001b[39;49;00m: SM_OUTPUT_DIR\n",
      "          \u001b[94mvalue\u001b[39;49;00m: \u001b[33m\"\u001b[39;49;00m\u001b[33mmnt/opt/ml/output/\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m\n",
      "        - \u001b[94mname\u001b[39;49;00m: SM_OUTPUT_DATA_DIR\n",
      "          \u001b[94mvalue\u001b[39;49;00m: \u001b[33m\"\u001b[39;49;00m\u001b[33mmnt/opt/ml/output/data/\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m\n",
      "        - \u001b[94mname\u001b[39;49;00m: SM_CHANNEL_TRAIN\n",
      "          \u001b[94mvalue\u001b[39;49;00m: \u001b[33m\"\u001b[39;49;00m\u001b[33mmnt/opt/ml/input/data/train\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m\n",
      "        - \u001b[94mname\u001b[39;49;00m: SM_CHANNEL_VALIDATION\n",
      "          \u001b[94mvalue\u001b[39;49;00m: \u001b[33m\"\u001b[39;49;00m\u001b[33mmnt/opt/ml/input/data/validation\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m     \n",
      "        - \u001b[94mname\u001b[39;49;00m: SM_CHANNEL_TEST\n",
      "          \u001b[94mvalue\u001b[39;49;00m: \u001b[33m\"\u001b[39;49;00m\u001b[33mmnt/opt/ml/input/data/test\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m\n",
      "      \u001b[94mvolumeMounts\u001b[39;49;00m:\n",
      "      - \u001b[94mmountPath\u001b[39;49;00m: /opt/ml\n",
      "        \u001b[94mname\u001b[39;49;00m: persistent-storage\n",
      "  \u001b[94mrestartPolicy\u001b[39;49;00m: Never \n",
      "\u001b[37m# TODO: Need to map all of the directories above using PVs/PVCs and local host provisioner\u001b[39;49;00m\n",
      "\n",
      "\n",
      "\u001b[37m# https://docs.aws.amazon.com/sagemaker/latest/dg/amazon-sagemaker-toolkits.html\u001b[39;49;00m\n",
      "\n",
      "\u001b[37m# /opt/ml\u001b[39;49;00m\n",
      "\u001b[37m# ├── input\u001b[39;49;00m\n",
      "\u001b[37m# │   ├── config\u001b[39;49;00m\n",
      "\u001b[37m# │   │   ├── hyperparameters.json\u001b[39;49;00m\n",
      "\u001b[37m# │   │   └── resourceConfig.json\u001b[39;49;00m\n",
      "\u001b[37m# │   └── data\u001b[39;49;00m\n",
      "\u001b[37m# │       └── <channel_name>\u001b[39;49;00m\n",
      "\u001b[37m# │           └── <input data>\u001b[39;49;00m\n",
      "\u001b[37m# ├── model\u001b[39;49;00m\n",
      "\u001b[37m# │\u001b[39;49;00m\n",
      "\u001b[37m# ├── code\u001b[39;49;00m\n",
      "\u001b[37m# │\u001b[39;49;00m\n",
      "\u001b[37m# ├── output\u001b[39;49;00m\n",
      "\u001b[37m# │\u001b[39;49;00m\n",
      "\u001b[37m# └── failure\u001b[39;49;00m\n",
      "\n",
      "\n",
      "\u001b[37m# https://github.com/aws/sagemaker-tensorflow-training-toolkit/blob/d2e8848e657ebc4a3c2a71a88b31275e1c1de403/src/sagemaker_tensorflow_container/training.py#L130# https://github.com/aws/sagemaker-\u001b[39;49;00m\n",
      "\u001b[37m# https://github.com/aws/sagemaker-tensorflow-training-toolkit/blob/d2e8848e657ebc4a3c2a71a88b31275e1c1de403/src/sagemaker_tensorflow_container/training.py#L228\u001b[39;49;00m\n"
     ]
    }
   ],
   "source": [
    "!pygmentize bert-csi-ebs.yaml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "pod/bert-ml-pod-eks created\n"
     ]
    }
   ],
   "source": [
    "!kubectl apply -f bert-csi-ebs.yaml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The pod is named by metadata.name in bert-csi-ebs.yaml (bert-ml-pod-eks), not by the manifest filename.\n",
    "!kubectl get pod bert-ml-pod-eks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Name:         bert-ml-pod-eks\n",
      "Namespace:    kubeflow\n",
      "Priority:     0\n",
      "Node:         ip-192-168-67-206.us-west-2.compute.internal/192.168.67.206\n",
      "Start Time:   Thu, 29 Oct 2020 11:58:46 +0000\n",
      "Labels:       <none>\n",
      "Annotations:  kubectl.kubernetes.io/last-applied-configuration:\n",
      "                {\"apiVersion\":\"v1\",\"kind\":\"Pod\",\"metadata\":{\"annotations\":{},\"name\":\"bert-ml-pod-eks\",\"namespace\":\"kubeflow\"},\"spec\":{\"containers\":[{\"comm...\n",
      "              kubernetes.io/psp: eks.privileged\n",
      "Status:       Pending\n",
      "IP:           \n",
      "Containers:\n",
      "  bert:\n",
      "    Container ID:  \n",
      "    Image:         231218423789.dkr.ecr.us-west-2.amazonaws.com/dlc-demo:bert\n",
      "    Image ID:      \n",
      "    Port:          <none>\n",
      "    Host Port:     <none>\n",
      "    Command:\n",
      "      python\n",
      "      /opt/ml/code/train.py\n",
      "      --train_steps_per_epoch=1\n",
      "      --epochs=1\n",
      "      --learning_rate=0.00001\n",
      "      --epsilon=0.00000001\n",
      "      --train_batch_size=36\n",
      "      --validation_batch_size=18\n",
      "      --test_batch_size=18\n",
      "      --train_steps_per_epoch=1\n",
      "      --validation_steps=1\n",
      "      --test_steps=1\n",
      "      --use_xla=True\n",
      "      --use_amp=False\n",
      "      --max_seq_length=64\n",
      "      --freeze_bert_layer=True\n",
      "      --enable_sagemaker_debugger=False\n",
      "      --enable_checkpointing=False\n",
      "      --enable_tensorboard=False\n",
      "      --run_validation=True\n",
      "      --run_test=False\n",
      "      --run_sample_predictions=True\n",
      "    State:          Waiting\n",
      "      Reason:       ContainerCreating\n",
      "    Ready:          False\n",
      "    Restart Count:  0\n",
      "    Environment:\n",
      "      SM_TRAINING_ENV:        {\"is_master\":true}\n",
      "      SAGEMAKER_JOB_NAME:     tf-bert-training-eks\n",
      "      SM_CURRENT_HOST:        localhost\n",
      "      SM_NUM_GPUS:            0\n",
      "      SM_HOSTS:               {\"hosts\":\"localhost\"}\n",
      "      SM_MODEL_DIR:           mnt/opt/ml/model/\n",
      "      SM_OUTPUT_DIR:          mnt/opt/ml/output/\n",
      "      SM_OUTPUT_DATA_DIR:     mnt/opt/ml/output/data/\n",
      "      SM_CHANNEL_TRAIN:       mnt/opt/ml/input/data/train\n",
      "      SM_CHANNEL_VALIDATION:  mnt/opt/ml/input/data/validation\n",
      "      SM_CHANNEL_TEST:        mnt/opt/ml/input/data/test\n",
      "    Mounts:\n",
      "      /opt/ml from persistent-storage (rw)\n",
      "      /var/run/secrets/kubernetes.io/serviceaccount from default-token-nnkjh (ro)\n",
      "Conditions:\n",
      "  Type              Status\n",
      "  Initialized       True \n",
      "  Ready             False \n",
      "  ContainersReady   False \n",
      "  PodScheduled      True \n",
      "Volumes:\n",
      "  persistent-storage:\n",
      "    Type:       PersistentVolumeClaim (a reference to a PersistentVolumeClaim in the same namespace)\n",
      "    ClaimName:  ebs-claim\n",
      "    ReadOnly:   false\n",
      "  default-token-nnkjh:\n",
      "    Type:        Secret (a volume populated by a Secret)\n",
      "    SecretName:  default-token-nnkjh\n",
      "    Optional:    false\n",
      "QoS Class:       BestEffort\n",
      "Node-Selectors:  <none>\n",
      "Tolerations:     node.kubernetes.io/not-ready:NoExecute for 300s\n",
      "                 node.kubernetes.io/unreachable:NoExecute for 300s\n",
      "Events:\n",
      "  Type    Reason                  Age   From                     Message\n",
      "  ----    ------                  ----  ----                     -------\n",
      "  Normal  Scheduled               7s    default-scheduler        Successfully assigned kubeflow/bert-ml-pod-eks to ip-192-168-67-206.us-west-2.compute.internal\n",
      "  Normal  SuccessfulAttachVolume  5s    attachdetach-controller  AttachVolume.Attach succeeded for volume \"pvc-12f27651-af46-4dfb-a181-5c768c87add9\"\n"
     ]
    }
   ],
   "source": [
    "# Describe the pod by its metadata.name (bert-ml-pod-eks); bert-csi-ebs is only the manifest filename.\n",
    "!kubectl describe pod bert-ml-pod-eks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pvc-12f27651-af46-4dfb-a181-5c768c87add9"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "python: can't open file '/opt/ml/code/train.py': [Errno 2] No such file or directory\n"
     ]
    }
   ],
   "source": [
    "# Stream logs from the pod defined in bert-csi-ebs.yaml; its metadata.name is bert-ml-pod-eks.\n",
    "!kubectl logs -f bert-ml-pod-eks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Name:            ebs-sc\n",
      "IsDefaultClass:  Yes\n",
      "Annotations:     kubectl.kubernetes.io/last-applied-configuration={\"apiVersion\":\"storage.k8s.io/v1\",\"kind\":\"StorageClass\",\"metadata\":{\"annotations\":{\"storageclass.kubernetes.io/is-default-class\":\"true\"},\"name\":\"ebs-sc\"},\"provisioner\":\"ebs.csi.aws.com\",\"volumeBindingMode\":\"WaitForFirstConsumer\"}\n",
      ",storageclass.kubernetes.io/is-default-class=true\n",
      "Provisioner:           ebs.csi.aws.com\n",
      "Parameters:            <none>\n",
      "AllowVolumeExpansion:  <unset>\n",
      "MountOptions:          <none>\n",
      "ReclaimPolicy:         Delete\n",
      "VolumeBindingMode:     WaitForFirstConsumer\n",
      "Events:                <none>\n"
     ]
    }
   ],
   "source": [
    "!kubectl describe storageclass ebs-sc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NAME                                       CAPACITY   ACCESS MODES   RECLAIM POLICY   STATUS   CLAIM                          STORAGECLASS     REASON   AGE\n",
      "pvc-097f3863-4fa9-4303-9195-d84d51a89985   20Gi       RWO            Delete           Bound    kubeflow/mysql-pv-claim        gp2                       53d\n",
      "pvc-12f27651-af46-4dfb-a181-5c768c87add9   5Gi        RWO            Delete           Bound    kubeflow/ebs-claim             ebs-sc                    36m\n",
      "pvc-36899807-7262-46fe-9627-5b3338abd535   10Gi       RWO            Delete           Bound    kubeflow/katib-mysql           gp2                       53d\n",
      "pvc-47f293c3-6406-4e1d-ac68-97190bbe8e5f   10Gi       RWO            Delete           Bound    anonymous/workspace-notebook   gp2                       53d\n",
      "pvc-64da95de-d5fc-4df6-b449-19439dbc7345   20Gi       RWO            Delete           Bound    kubeflow/minio-pv-claim        gp2                       53d\n",
      "pvc-a252cad0-6cc0-4f08-8c81-d0e257849351   5G         RWO            Delete           Bound    kubeflow/local-hostpath-pvc    local-hostpath            119m\n",
      "pvc-ca36d136-cbfb-4466-9152-2cd389f32fef   10Gi       RWO            Delete           Bound    kubeflow/metadata-mysql        gp2                       53d\n"
     ]
    }
   ],
   "source": [
    "!kubectl get pv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Name:              pvc-12f27651-af46-4dfb-a181-5c768c87add9\n",
      "Labels:            <none>\n",
      "Annotations:       pv.kubernetes.io/provisioned-by: ebs.csi.aws.com\n",
      "Finalizers:        [kubernetes.io/pv-protection external-attacher/ebs-csi-aws-com]\n",
      "StorageClass:      ebs-sc\n",
      "Status:            Bound\n",
      "Claim:             kubeflow/ebs-claim\n",
      "Reclaim Policy:    Delete\n",
      "Access Modes:      RWO\n",
      "VolumeMode:        Filesystem\n",
      "Capacity:          5Gi\n",
      "Node Affinity:     \n",
      "  Required Terms:  \n",
      "    Term 0:        topology.ebs.csi.aws.com/zone in [us-west-2d]\n",
      "Message:           \n",
      "Source:\n",
      "    Type:              CSI (a Container Storage Interface (CSI) volume source)\n",
      "    Driver:            ebs.csi.aws.com\n",
      "    VolumeHandle:      vol-0e9e845913141f9db\n",
      "    ReadOnly:          false\n",
      "    VolumeAttributes:      storage.kubernetes.io/csiProvisionerIdentity=1603971912642-8081-ebs.csi.aws.com\n",
      "Events:                <none>\n"
     ]
    }
   ],
   "source": [
    "!kubectl describe pv pvc-12f27651-af46-4dfb-a181-5c768c87add9"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ebs_volume='vol-0e9e845913141f9db'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "error: cannot exec into a container in a completed pod; current phase is Succeeded\n"
     ]
    }
   ],
   "source": [
    "# No -it: the notebook shell has no TTY. '--' separates kubectl flags from the command run in the container.\n",
    "!kubectl exec bert-ml-pod-eks -- ls /opt/ml/model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "    \"Volumes\": [\n",
      "        {\n",
      "            \"Attachments\": [],\n",
      "            \"AvailabilityZone\": \"us-west-2d\",\n",
      "            \"CreateTime\": \"2020-10-29T11:58:37.002Z\",\n",
      "            \"Encrypted\": false,\n",
      "            \"Size\": 5,\n",
      "            \"SnapshotId\": \"\",\n",
      "            \"State\": \"available\",\n",
      "            \"VolumeId\": \"vol-0e9e845913141f9db\",\n",
      "            \"Iops\": 100,\n",
      "            \"Tags\": [\n",
      "                {\n",
      "                    \"Key\": \"CSIVolumeName\",\n",
      "                    \"Value\": \"pvc-12f27651-af46-4dfb-a181-5c768c87add9\"\n",
      "                }\n",
      "            ],\n",
      "            \"VolumeType\": \"gp2\",\n",
      "            \"MultiAttachEnabled\": false\n",
      "        }\n",
      "    ]\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "# Reuse the volume ID stored in ebs_volume (defined in an earlier cell) instead of repeating the literal.\n",
    "!aws ec2 describe-volumes \\\n",
    "    --volume-ids $ebs_volume"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
